Loading packages and subroutines

Packages

require(Seurat)
require(DT) # for datatable
require(SeuratWrappers) # for fastMNN
require(harmony) # for RunHarmony
require(reshape2) # for dcast

Import table

ImportTable <- function(file, header = T, row.names = 1, sep = ',', check.names = FALSE, ...){
  data <- read.csv(file = file, header = header, row.names = row.names, sep = sep, check.names = check.names, ...)
  return(data)
}

show_table

show_table <- function(df, rownames = T, filter="top", options = list(pageLength = 10, scrollX=T), ...){
  if (ncol(df) > 50) {
    df <- df[, 1:50]
    message('Due to the column dim is > 50, it only shows the ')
  }
  datatable(df, rownames = rownames, 
            filter=filter, options = options, ... )

}

MultMedFC

MultMedFC: Multiple Median Fold Change, compared each samples between 2 Stages (Control vs CRC)

MultMedFC <- function(data1, data2, FeatureAsRow = Tsf){
  if (FeatureAsRow) {
    data1 <- t(data1)
    data2 <- t(data2)
  }
  data1 <- as.data.frame(data1)
  data2 <- as.data.frame(data2)
  if (! all(colnames(data1) == colnames(data2))) {
    message("rowname of data1 and data2 is not fullly same!! plz check it !!")
    return()
  }
  
  sf_df <- matrix(NA, ncol = 3); sf_df <- sf_df[-1, ]
  for (sf in colnames(data1)) {
    sf_FC <- NULL
    if (sum(data1[[sf]] == 0) == nrow(data1) & sum(data2[[sf]] == 0) == nrow(data2)) {
      sf_df <- rbind(sf_df, c(1,1,1))
    }else{
      min_value <- min(c(data1[[sf]][data1[[sf]] != 0], data2[[sf]][data2[[sf]] != 0]))/10
      data1[[sf]][data1[[sf]] == 0] <- min_value
      data2[[sf]][data2[[sf]] == 0] <- min_value
      for (i1 in 1:nrow(data1)) {
        for (i2 in 1:nrow(data2)) {
          sf_FC <- c(sf_FC, data1[[sf]][i1]/data2[[sf]][i2])
        } 
      }
      sf_wil <- wilcox.test(sf_FC, conf.int = TRUE)
      sf_df <- rbind(sf_df, c(as.numeric(sf_wil$estimate), as.numeric(sf_wil$conf.int)))
    }
  }
  sf_df <- as.data.frame(sf_df); rownames(sf_df) <- colnames(data1); colnames(sf_df) <- c("Median", 'low-CI', 'high-CI')
  return(sf_df)
}

FileCreate

FileCreate <- function(DirPath = "./",Prefix = "ExampleTest", Suffix = "pdf", version = "0.0.0"){
  date=as.character(Sys.Date())
  DirPath = gsub("/$","",DirPath)
  Suffix = gsub('^[.]',"",Suffix)
  if(! dir.exists(DirPath)){
    dir.create(DirPath,recursive =TRUE)
  }
  if (version == "0.0.0") {
    version=100
    while(TRUE){
      version_=paste(unlist(strsplit(as.character(version),"")),collapse=".")
      DirPathName = paste0(DirPath,"/",date,'-',Prefix,'-v',version_,".",Suffix)
      if(! file.exists(DirPathName)){
        return(DirPathName)
        break
      }
      version = version + 1
    }
  }else{
    DirPathName = paste0(DirPath,"/",date,'-',Prefix,'-v',version_,".",Suffix)
    if(! dir.exists(DirPathName)){
      return(DirPathName)
    }
    return(DirPathName)
  }
}

Import data

Selected matrix

  • leave the features with p-value less than 0.05 in all cohorts,
  • select the feature with same trend in all cohorts
summarized_df <- ImportTable(file = '../01.SSTF/2021-07-07-Summary-walsh-median-v1.0.0.csv')
sel_df <- summarized_df[(summarized_df$P.count == 8 | summarized_df$N.count == 8), ] # 6
sel2_df <- summarized_df[(summarized_df$P.count >= 7 | summarized_df$N.count >= 7), ] # 66
sel3_df <- summarized_df[(summarized_df$P.count >= 6 | summarized_df$N.count >= 6), ] # 260

show_table(summarized_df)  %>%
  formatSignif(columns = colnames(summarized_df[,c(5:20)]),
               digits = 3, interval = 1) %>%
  formatRound(columns = colnames(summarized_df[,c(1:4)]),
               digits = 0, interval = 1)

Taxonomy name

tax_name <- ImportTable(file = '../00.ProcessData/2021-04-12-allTaxonomySplitLevel-v1.0.1.tsv', sep = '\t')
tax_name$new_name <- gsub(pattern = 's__', replacement = '', x = tax_name$Specie) %>% 
  gsub('_', ' ', . ) # due to Feature names of CreateSeuratObject cannot have underscores, replace with blanket space.
show_table(tax_name)

Taxonomy matrix

raw_tax_df <- ImportTable(file = '../00.ProcessData/2021-04-12-ReAbun-Fungi-filter1802-v1.0.0.tsv', sep = '\t')
# tax_df <- ImportTable(file = '../00.ProcessData/2021-04-12-RawData-Fungi-filter1802-v1.0.0.tsv', sep = '\t')
rownames(raw_tax_df) <- tax_name[rownames(raw_tax_df), "new_name"] # use the short species names
# show_table(raw_tax_df)
tax_df <- raw_tax_df[rownames(sel3_df),]
show_table(tax_df) %>%
  formatSignif(columns = colnames(tax_df)[1:50],
               digits = 3, interval = 1)
## Due to the column dim is > 50, it only shows the

Meta information

meta_df <- ImportTable(file = '../00.ProcessData/metaInfo-subgroup-v4.1.csv')
meta_df <- meta_df[colnames(tax_df),]
show_table(meta_df)

Remove the batch effect by Seurat

Create seurat object

CreateSeuratObject: Create a Seurat object link

s_obj <- CreateSeuratObject(counts = tax_df, meta.data = meta_df)

# split the object by cohorts
s_list <- SplitObject(s_obj, split.by = 'Cohort')

Remove batch effect

RPCA

Fast integration using reciprocal PCA (RPCA) link

Pre-process
calculateORnot <- T
if (calculateORnot) {
  for (i in 1:length(s_list)) {
      s_list[[i]] <- NormalizeData(s_list[[i]], verbose = FALSE)
      s_list[[i]] <- FindVariableFeatures(s_list[[i]], selection.method = "vst", nfeatures = 2000, verbose = FALSE)
  }
  
  s_anchors <- FindIntegrationAnchors(object.list = s_list, k.anchor = 20)
  
  s_integrated <- IntegrateData(anchorset = s_anchors, k.weight = 10) %>%
    ScaleData(. , verbose = FALSE) %>% 
    RunPCA(. , npcs = 30, verbose = FALSE) %>%
    RunUMAP(. , reduction = "pca", dims = 1:30, verbose = FALSE)
  
  prca_obj <- FindNeighbors(s_integrated, dims = 1:30) %>%
    FindClusters()
  
  prca_table <- dcast(as.data.frame(table(prca_obj@active.ident,prca_obj@meta.data[["Stage"]])), Var1 ~ Var2)
  rownames(prca_table) <- prca_table$Var1; prca_table <- prca_table[,-1]
  mod_file <- FileCreate(DirPath = '../02.BatchEffect/RPCA', Prefix = 'RPCA-model', Suffix = 'rds')
  saveRDS(object = prca_obj, file = mod_file)
  tab_file <- FileCreate(DirPath = '../02.BatchEffect/RPCA', Prefix = 'RPCA-summary', Suffix = 'csv')
  write.csv(x = prca_table, file = tab_file)
}else{
  prca_obj <- readRDS(file = '../02.BatchEffect/RPCA/2021-07-09-RPCA-model-v1.0.0.rds')
  prca_table <- ImportTable(file = '../02.BatchEffect/RPCA/2021-07-09-RPCA-summary-v1.0.0.csv')
}
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1802
## Number of edges: 72235
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7232
## Number of communities: 7
## Elapsed time: 0 seconds
Table
show_table(prca_table)
Figure
DimPlot(prca_obj, group.by = c("Cohort", "Stage", 'ident'), ncol = 1)

fastMNN

fastMNN: Batch effects in single-cell RNA-sequencing data are corrected by matching mutual nearest neighbors link
Nature Biotechnology, 2018, DOI: 10.1038/nbt.4091

Pre-process
calculateORnot <- T
if (calculateORnot) {
  fMNN_obj <- NormalizeData(s_obj) %>% 
    FindVariableFeatures() %>%
    SplitObject(. ,  split.by = 'Cohort') %>%
    RunFastMNN()
  
  fMNN_obj2 <- RunUMAP(fMNN_obj, reduction = 'mnn', dims = 1:30) %>%
    FindNeighbors(. , reduction = 'mnn', dims = 1:30) %>%
    FindClusters()
  
  fMNN_table <- dcast(as.data.frame(table(fMNN_obj2@active.ident,fMNN_obj2@meta.data[["Stage"]])), Var1 ~ Var2)
  rownames(fMNN_table) <- fMNN_table$Var1; fMNN_table <- fMNN_table[,-1]
  mod_file <- FileCreate(DirPath = '../02.BatchEffect/fastMNN', Prefix = 'fastMNN-model', Suffix = 'rds')
  saveRDS(object = fMNN_obj2, file = mod_file)
  tab_file <- FileCreate(DirPath = '../02.BatchEffect/fastMNN', Prefix = 'fastMNN-summary', Suffix = 'csv')
  write.csv(x = fMNN_table, file = tab_file)
}else{
  fMNN_obj2 <- readRDS(file = '../02.BatchEffect/fastMNN/2021-07-09-fastMNN-model-v1.0.0.rds')
  fMNN_table <- ImportTable(file = '../02.BatchEffect/fastMNN/2021-07-09-fastMNN-summary-v1.0.0.csv')
}
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from mnn.reconstructed_ to mnnreconstructed_
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1802
## Number of edges: 79262
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6388
## Number of communities: 7
## Elapsed time: 0 seconds
Table
show_table(fMNN_table)
Figure
DimPlot(fMNN_obj2, group.by = c("Cohort", "Stage", 'ident'), ncol = 1)

Harmony

Fast, sensitive, and flexible integration of single cell data with Harmony. link

Nature Methods, 2019, DOI: 10.1038/s41592-019-0619-0

Pre-process
calculateORnot <- T
if (calculateORnot) {
  hmy_obj <- NormalizeData(s_obj) %>%
    FindVariableFeatures() %>% 
    ScaleData() %>% 
    RunPCA(. , verbose = FALSE)
  
  hmy_obj2 <- RunHarmony(object = hmy_obj, group.by.vars = 'Cohort') %>%
    RunUMAP(., reduction = "harmony", dims = 1:30) %>%
    FindNeighbors(., reduction = "harmony", dims = 1:30) %>% 
    FindClusters()
  
  hmy_table <- dcast(as.data.frame(table(hmy_obj2@active.ident,hmy_obj2@meta.data[["Stage"]])), Var1 ~ Var2)
  rownames(hmy_table) <- hmy_table$Var1; hmy_table <- hmy_table[,-1]
  mod_file <- FileCreate(DirPath = '../02.BatchEffect/Harmony', Prefix = 'Harmony-model', Suffix = 'rds')
  saveRDS(object = hmy_obj2, file = mod_file)
  tab_file <- FileCreate(DirPath = '../02.BatchEffect/Harmony', Prefix = 'Harmony-summary', Suffix = 'csv')
  write.csv(x = hmy_table, file = tab_file)
}else{
  hmy_obj2 <- readRDS(file = '../02.BatchEffect/Harmony/2021-07-09-Harmony-model-v1.0.0.rds')
  hmy_table <- ImportTable(file = '../02.BatchEffect/Harmony/2021-07-09-Harmony-summary-v1.0.0.csv')
}
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1802
## Number of edges: 83057
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7194
## Number of communities: 8
## Elapsed time: 0 seconds
Table
show_table(hmy_table)
Figure
DimPlot(hmy_obj2, group.by = c("Cohort", "Stage", 'ident'), ncol = 1)